Wind energy has emerged as one of the most promising renewable energy sources worldwide. However, the operation and maintenance of wind turbines are costly, especially when failures occur unexpectedly. Traditional maintenance practices either schedule repairs too frequently (leading to unnecessary costs) or too late (resulting in catastrophic breakdowns and replacements).
ReneWind aims to leverage machine learning and predictive maintenance to accurately identify potential generator failures in wind turbines. By analyzing sensor data, the company seeks to predict failures before they happen, thereby reducing downtime, minimizing costs, and improving operational efficiency.
The data provided is a transformed version of the original data which was collected using sensors.
Both datasets consist of 40 predictor variables and 1 target variable.
# importing core libraries for data manipulation and visualisation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# importing scikit-learn utilities for preprocessing, model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
# to oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# importing tensorflow utilities
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings("ignore")
# Mount Google Drive so the dataset CSVs are reachable from this notebook.
# NOTE: Colab-only — prompts for authorization on first run.
from google.colab import drive
drive.mount('/content/drive')
# Load the train/test CSVs from Google Drive into pandas DataFrames.
base_path = "/content/drive/MyDrive/GL_Project/ReneWind"
df_train = pd.read_csv(f"{base_path}/Train.csv")
df_test = pd.read_csv(f"{base_path}/Test.csv")

# Keep untouched copies so the originals can be recovered after preprocessing.
df_train_copy, df_test_copy = df_train.copy(), df_test.copy()

# Display the first 5 rows of the train data.
df_train.head(5)
Observation:
The first 5 rows of the train data, along with the column headers, are displayed here.
# Display the first rows of the test data (head() defaults to 5 rows).
df_test.head()
Observation:
The first 5 rows of the test data, along with the column headers, are displayed here.
# Checking the shape of the train data — .shape returns a (rows, columns) tuple.
df_train.shape
Observation:
The .shape attribute returns the number of rows and columns. Here, the train dataset has 20000 rows and 41 columns.
# Checking the shape of the test data — .shape returns a (rows, columns) tuple.
df_test.shape
Observation:
The .shape attribute returns the number of rows and columns. Here, the test dataset has 50000 rows and 41 columns.
# Column dtypes, non-null counts and memory usage of the train data.
df_train.info()
Observation:
# Column dtypes, non-null counts and memory usage of the test data.
df_test.info()
Observation:
# Bar chart of how many samples fall in each target class.
ax = sns.countplot(data=df_train, x='Target', palette='Set2')
ax.set_title('Distribution of Target Variable')
plt.show()
Observation:
The plot shows high imbalance in the data:
* Class 0 (No failures) has around 18,000 samples.
* Class 1 (Failures) has around 2,000 samples.
(The two counts must total the 20,000 training rows, giving a ratio of roughly 90:10 with Class 1 as the minority class.)
# Univariate analysis: histogram, KDE and boxplot side-by-side for every predictor.
target_col = 'Target'
features = [col for col in df_train.columns if col != target_col]  # to separate features

for col in features:
    plt.figure(figsize=(14, 4))

    # Histogram — fix: sns.distplot is deprecated (removed in recent seaborn);
    # histplot is the supported replacement.
    plt.subplot(1, 3, 1)
    sns.histplot(df_train[col], kde=True, bins=30, color='red')
    plt.title(f'Histogram distribution of {col}')

    # KDE — fix: `shade=True` is deprecated in favour of `fill=True`.
    plt.subplot(1, 3, 2)
    sns.kdeplot(df_train[col], fill=True, color='skyblue')
    plt.title(f'KDE distribution of {col}')

    # Boxplot for spread and outliers.
    plt.subplot(1, 3, 3)
    sns.boxplot(y=df_train[col], color='lightgreen')
    plt.title(f'Boxplot distribution of {col}')

    plt.tight_layout()
    plt.show()
Observation:
# Bivariate analysis: each feature's distribution split by target class.
for col in features:
    plt.figure(figsize=(12, 4))

    # Boxplot of the feature grouped by target.
    plt.subplot(1, 2, 1)
    sns.boxplot(x='Target', y=col, data=df_train, palette='Set2')
    plt.title(f'Boxplot distribution of {col} vs Target')

    # Violin plot of the feature grouped by target.
    plt.subplot(1, 2, 2)
    sns.violinplot(x='Target', y=col, data=df_train, palette='muted')
    plt.title(f'Violin plot distribution of {col} vs Target')

    plt.tight_layout()
    plt.show()  # fix: render each per-feature figure (the original never showed them)

# Correlation heatmap — drawn once, explicitly outside the per-feature loop.
plt.figure(figsize=(15, 10))
sns.heatmap(df_train.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Pairplot of the first five features coloured by target class.
ss = features[:5] + [target_col]
g = sns.pairplot(df_train[ss], hue='Target')
# fix: plt.title would only label the last axes of the grid; suptitle titles the figure.
g.fig.suptitle('Pairplot of Features', y=1.02)
plt.show()
Observation:
# Statistical summary (count, mean, std, min/max, quartiles) of the train data.
df_train.describe()
Observation:
In the train data, the minimum feature value is around -3.42 while the maximum is around 23.63.
# Statistical summary (count, mean, std, min/max, quartiles) of the test data.
df_test.describe()
Observation:
In the test data, the minimum feature value is around -17.24 while the maximum is around 26.53.
# Separate the predictors from the target column in both datasets.
X = df_train.drop('Target', axis=1)
y = df_train['Target'].copy()
X_test = df_test.drop(['Target'], axis=1)
y_test = df_test['Target'].copy()

# Stratified 80/20 split so the 90:10 class ratio is preserved in both folds.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the shape of every resulting split.
for label, part in (("X_train", X_train), ("y_train", y_train),
                    ("X_val", X_val), ("y_val", y_val),
                    ("X_test", X_test), ("y_test", y_test)):
    print(f"{label} shape: {part.shape}")
Observation:
# Median imputation: fit on the training split only, then apply the same
# learned medians to validation and test to avoid data leakage.
imputer = SimpleImputer(strategy="median")
feature_cols = X_train.columns

X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=feature_cols)
X_val = pd.DataFrame(imputer.transform(X_val), columns=feature_cols)
X_test = pd.DataFrame(imputer.transform(X_test), columns=feature_cols)

# Confirm that no missing values remain in any split.
print(f"Missing values in X_train:\n{X_train.isnull().sum()}")
print(f"\nMissing values in X_val:\n{X_val.isnull().sum()}")
print(f"\nMissing values in X_test:\n{X_test.isnull().sum()}")
Observation:
After median imputation, no missing values remain in any of the train, validation or test splits.
# Count fully duplicated rows in the train data (0 means no duplicates).
df_train.duplicated().sum()
Observation:
There are no duplicate values in the given dataset of ReneWind.
# Outlier check: visual boxplots for the first five features, then an
# IQR-based outlier count for every numeric feature.
for col in features[:5]:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df_train[col], color='lightgreen')
    plt.title(f'Boxplot distribution of {col}')
    plt.show()

out_sum = {}
for col in features:
    if pd.api.types.is_numeric_dtype(df_train[col]):  # only numeric features
        Q1 = df_train[col].quantile(0.25)
        Q3 = df_train[col].quantile(0.75)
        IQR = Q3 - Q1
        # Tukey fences: values beyond 1.5*IQR from the quartiles are outliers.
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = ((df_train[col] < lower_bound) | (df_train[col] > upper_bound)).sum()
        out_sum[col] = outliers

# fix: out_sum was computed but never displayed — show counts, largest first.
print(pd.Series(out_sum).sort_values(ascending=False))
Observation:
# Standardise features (zero mean, unit variance): fit the scaler on the
# training split only and reuse the same parameters on validation and test.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
The nature of predictions made by the classification model will translate as follows:
* True positive (TP): a real failure correctly predicted, allowing timely repair.
* False negative (FN): a real failure the model misses — the costliest outcome, since it leads to breakdown and replacement.
* False positive (FP): a false alarm that triggers an unnecessary inspection — cheaper than a missed failure.
Because missed failures are the most expensive, recall on the failure class (Class 1) is the key metric to maximize.
# model evaluation
def evaluate_model(model, X, y, set_name="Set", threshold=0.5):
    """Print and return classification metrics for `model` on (X, y).

    Parameters
    ----------
    model : fitted Keras-style model whose predict() returns failure probabilities
    X, y : features and binary labels (array-like)
    set_name : label used in the printed report header
    threshold : probability cut-off for converting probabilities to hard labels
        (generalizes the previously hard-coded 0.5; default preserves behavior)

    Returns
    -------
    dict with accuracy, f1, roc_auc, precision and recall
    """
    X = np.array(X)
    y = np.array(y)
    prob = model.predict(X).ravel()
    preds = (prob > threshold).astype(int)
    acc = accuracy_score(y, preds)
    f1 = f1_score(y, preds)
    prec = precision_score(y, preds)
    rec = recall_score(y, preds)
    # ROC AUC is most informative on raw probabilities; fall back to hard
    # predictions only if that fails.
    # fix: the original also computed a redundant roc_auc_score(y, preds)
    # that was immediately overwritten by this try/except.
    try:
        roc = roc_auc_score(y, prob)
    except Exception:
        roc = roc_auc_score(y, preds)
    print(f"\tEvaluation on {set_name}\n")
    print(f"Accuracy : {acc:.4f}")
    print(f"F1 Score : {f1:.4f}")
    print(f"ROC AUC Score : {roc:.4f}")
    print(f"Precision : {prec:.4f}")
    print(f"Recall : {rec:.4f}")
    print("\nConfusion Matrix:\n", confusion_matrix(y, preds))
    print("\nClassification Report:\n", classification_report(y, preds))
    return {"accuracy": acc, "f1": f1, "roc_auc": roc, "precision": prec, "recall": rec}
# Balanced class weights to counter the heavy target imbalance: each class is
# weighted inversely to its frequency in the training labels.
class_weights_arr = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights_arr))
print("Class weights:", class_weights)
# building model
def build_model(layers, optimizer, dropout_rate=0.0, X_tr=None, y_tr=None, X_val=None, y_val=None, use_class_weights=False, epochs=50, batch_size=64, verbose=1):
    """Build, train and evaluate a feed-forward binary classifier.

    Parameters
    ----------
    layers : list[int] — units per Dense layer (first entry is the input layer width)
    optimizer : Keras optimizer instance
    dropout_rate : dropout applied after each hidden layer when > 0
    X_tr, y_tr / X_val, y_val : training and validation data
    use_class_weights : apply the module-level `class_weights` dict when True
    epochs, batch_size, verbose : passed through to model.fit

    Returns
    -------
    (model, history) — the trained model and its Keras History object
    """
    modell = Sequential()
    # input layer
    modell.add(Dense(layers[0], activation='relu', input_dim=X_tr.shape[1]))
    # hidden layers — each optionally followed by dropout for regularisation
    for size in layers[1:]:
        modell.add(Dense(size, activation='relu'))
        if dropout_rate > 0:
            modell.add(Dropout(dropout_rate))
    modell.add(Dense(1, activation='sigmoid'))  # output layer: failure probability
    modell.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    # stop when validation loss plateaus and restore the best weights
    es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    # fix: type(optimizer.name).__name__ always printed 'str'; print the optimizer class
    print(f"\nTraining model: layers={layers}, optimizer={type(optimizer).__name__}, dropout={dropout_rate}, class_weights={use_class_weights}")
    hist = modell.fit(X_tr, y_tr,
                      epochs=epochs, batch_size=batch_size,  # fix: honour batch_size (was hard-coded to 64)
                      validation_data=(X_val, y_val),
                      verbose=verbose,
                      callbacks=[es],  # fix: EarlyStopping was created but never passed to fit()
                      class_weight=class_weights if use_class_weights else None)
    # print evaluation reports for both splits (return values only printed)
    evaluate_model(modell, X_tr, y_tr, set_name="Training")
    evaluate_model(modell, X_val, y_val, set_name="Validation")
    return modell, hist
# model 1: baseline - shallow NN + SGD
m1, h1 = build_model(
    [32, 16],
    SGD(learning_rate=0.01),
    dropout_rate=0.0,
    X_tr=X_train_scaled, y_tr=y_train.values,
    X_val=X_val_scaled, y_val=y_val.values,
    use_class_weights=False,
    epochs=50, batch_size=64, verbose=1,
)
Observation:
# model 2: Deeper NN + SGD (adds a third hidden layer and light dropout)
m2, h2 = build_model(
    [64, 32, 16],
    SGD(learning_rate=0.01),
    dropout_rate=0.1,
    X_tr=X_train_scaled, y_tr=y_train.values,
    X_val=X_val_scaled, y_val=y_val.values,
    use_class_weights=False,
    epochs=50, batch_size=64, verbose=1,
)
Observation:
# model 3: NN + Adam Optimizer
# fix: the comment promises Adam but the original passed SGD, so this
# experiment never actually varied the optimizer.
m3, h3 = build_model([32, 16], Adam(learning_rate=0.001),
dropout_rate=0.0, X_tr=X_train_scaled, y_tr = y_train.values,
X_val = X_val_scaled, y_val = y_val.values,
use_class_weights=False, epochs=50, batch_size=64, verbose=1)
Observation:
# model 4: NN + Adam + Dropout
# fix: the comment promises Adam but the original passed SGD.
m4, h4 = build_model([64, 32], Adam(learning_rate=0.001),
dropout_rate=0.3, X_tr=X_train_scaled, y_tr = y_train.values,
X_val = X_val_scaled, y_val = y_val.values,
use_class_weights=False, epochs=50, batch_size=64, verbose=1)
Observation:
# model 5: NN + SGD + Class Weight
# fix: the comment promises class weighting but the original passed
# use_class_weights=False, making this a duplicate of model 1.
m5, h5 = build_model([32, 16], SGD(learning_rate=0.01),
dropout_rate=0.0, X_tr=X_train_scaled, y_tr = y_train.values,
X_val = X_val_scaled, y_val = y_val.values,
use_class_weights=True, epochs=50, batch_size=64, verbose=1)
Observation:
# model 6: Deep NN + Adam + Dropout + Class Weight
# fix: the comment promises class weighting but the original passed
# use_class_weights=False, so the imbalance handling was never applied.
m6, h6 = build_model([128, 64, 32], Adam(learning_rate=0.001),
dropout_rate=0.4, X_tr=X_train_scaled, y_tr = y_train.values,
X_val = X_val_scaled, y_val = y_val.values,
use_class_weights=True, epochs=50, batch_size=64, verbose=1)
Observation: